ohibc logo
OHI British Columbia | OHI Science | Citation policy

knitr::opts_chunk$set(fig.width = 6, fig.height = 4, fig.path = 'Figs/',
                      echo = TRUE, message = FALSE, warning = FALSE)

library(raster)
library(data.table)


source('~/github/src/R/common.R')  ###
  ### includes library(tidyverse); library(stringr); dir_M points to ohi directory

dir_git <- '~/github/spp_health_dists'

dir_am_data <- file.path(dir_M, 'git-annex/globalprep/_raw_data/aquamaps/d2017')

### goal specific folders and info
dir_setup <- file.path(dir_git, 'data_setup')
dir_data  <- file.path(dir_git, 'data')
dir_o_anx <- file.path(dir_O, 'git-annex/spp_health_dists')
# ### provenance tracking
# library(provRmd); prov_setup()

### support scripts
source('~/github/src/R/rast_tools.R')
  ### raster plotting and analyzing scripts

1 Summary

Create a map of the distribution of biodiversity - all species in AquaMaps assessed by IUCN (using AquaMaps version of IUCN ID numbers). These maps will be generated using all taxonomic groups:

  • All species:
    • Number of species (species richness)
    • Range rarity
    • Relative range rarity
  • IUCN-assessed species:
    • Mean risk
    • Variance of risk
    • Number of species

The following maps will also be generated for taxonomic groups (IUCN-assessed species only): * Mean risk * Variance of risk * Number of species

2 Data Sources

3 Methods

3.1 Spatial distribution of all AquaMaps species

3.1.1 Species richness

Species richness as the raw count of species indicated to exist in a particular cell. This is calculated for all species included in the AquaMaps dataset, regardless of IUCN assessment status.

We can consider species presence in several ways:

  • all species indicated as present at any level (i.e. any non-zero probability of occurrence)
  • all species indicated as “core habitat” (i.e. 60% or greater probability of occurrence)
  • all species possibly present, with presence weighted by probability of occurrence
spp_richness_file <- file.path(dir_data, 'am_all_spp_richness.csv')

if(!file.exists(spp_richness_file)) {
  am_spp_cells <- fread(file.path(dir_o_anx, 'am_files/am_spp_cells_all.csv'))
  
  spp_richness_0pct_probwt <- am_spp_cells %>%
    group_by(loiczid) %>%
    summarize(n_spp_0pct = n(),
              n_spp_probwt = sum(prob))
  
  spp_richness_60pct <- am_spp_cells %>%
    filter(prob >= .60) %>%
    group_by(loiczid) %>%
    summarize(n_spp_60pct = n())
  
  spp_richness <- spp_richness_0pct_probwt %>%
    full_join(spp_richness_60pct, by = 'loiczid')
  
  write_csv(spp_richness, spp_richness_file)

}
spp_richness_file <- file.path(dir_data, 'am_all_spp_richness.csv')
spp_richness <- read_csv(spp_richness_file, col_types = 'dddd') %>%
  mutate(log_n_probwt = log(n_spp_probwt))

loiczid_rast <- raster(file.path(dir_git, 'spatial/loiczid_raster.tif'))


rich_rast_0pct   <- subs(loiczid_rast, spp_richness, by = 'loiczid', which = 'n_spp_0pct')
rich_rast_60pct  <- subs(loiczid_rast, spp_richness, by = 'loiczid', which = 'n_spp_60pct')
rich_rast_probwt <- subs(loiczid_rast, spp_richness, by = 'loiczid', which = 'n_spp_probwt')
rich_rast_log_probwt <- subs(loiczid_rast, spp_richness, by = 'loiczid', which = 'log_n_probwt')

plot(rich_rast_0pct, main = 'Richness, All species, any presence')

plot(rich_rast_60pct, main = 'Richness, All species, prob of occur >= 60%')

plot(rich_rast_probwt, main = 'Richness, All species, weighted by prob of occur')

plot(rich_rast_log_probwt, main = 'Richness, All species, log(prob of occur weighted))')

3.1.2 Species range rarity

Species range rarity is a measure of endemism that weighs the presence of a species in a particular area relative to its entire range. It is defined in Selig et al 2014 (and referenced from Williams et al. 1996: Promise and problems in applying quantitative complementary areas for representing the diversity of some neotropical plants)

\[Range \hspace{3pt} rarity_{cell} = \sum_{i=1}^N \frac{1}{A_i} \times w\]

Relative range rarity is range rarity divided by species richness to remove confounding effects (again as defined in Selig et al 2014).

As in Selig et al 2014, we will multiply rarity by 100,000 and relative rarity by 1000 for analytical convenience.

This is calculated for all species included in the AquaMaps dataset, regardless of IUCN assessment status.

We will calculate range rarity using the same three cuts as for richness; for each cut, species presence, total range, and richness will all be calculated using the same cut method.

spp_rarity_file <- file.path(dir_data, 'am_all_spp_rarity.csv')

if(!file.exists(spp_rarity_file)) {
  if(!exists('am_spp_cells')) {
    am_spp_cells <- fread(file.path(dir_o_anx, 'am_files/am_spp_cells_all.csv'))
  }
  
  am_spp_ranges <- read_csv(file.path(dir_data, 'am_spp_range_summary.csv'))
  
  spp_cells_ranges <- am_spp_cells %>%
    left_join(am_spp_ranges, by = 'am_sid')
  
  spp_rarity_0pct_probwt <- spp_cells_ranges %>%
    group_by(loiczid) %>%
    summarize(rarity_0pct   = sum(  1  / area_km2_0pct),
              rarity_probwt = sum(prob / area_km2_probwt))
  
  spp_rarity_60pct <- spp_cells_ranges %>%
    filter(prob >= .60) %>%
    group_by(loiczid) %>%
    summarize(rarity_60pct = sum( 1 / area_km2_60pct, na.rm = TRUE))
  
  spp_rarity <- spp_rarity_0pct_probwt %>%
    full_join(spp_rarity_60pct, by = 'loiczid') %>%
    mutate(rarity_0pct   = rarity_0pct * 100000,
           rarity_60pct  = rarity_60pct * 100000,
           rarity_probwt = rarity_probwt * 100000)
  
  write_csv(spp_rarity, spp_rarity_file)

}
spp_rarity   <- read_csv(file.path(dir_data, 'am_all_spp_rarity.csv'),
                         col_types = 'dddd')
spp_richness <- read_csv(file.path(dir_data, 'am_all_spp_richness.csv'),
                         col_types = 'dddd')

spp_rel_rare <- spp_rarity %>%
  left_join(spp_richness, by = 'loiczid') %>%
  mutate(rel_rare_0pct   = rarity_0pct / n_spp_0pct * 1000,
         rel_rare_60pct  = rarity_60pct / n_spp_60pct * 1000,
         rel_rare_probwt = rarity_probwt / n_spp_probwt * 1000) %>%
  select(loiczid, contains('rel_rare'))

write_csv(spp_rel_rare, file.path(dir_data, 'am_all_spp_rel_rarity.csv'))
spp_rarity   <- read_csv(file.path(dir_data, 'am_all_spp_rarity.csv'),
                         col_types = 'dddd')
spp_rel_rare <- read_csv(file.path(dir_data, 'am_all_spp_rel_rarity.csv'),
                         col_types = 'dddd')

loiczid_rast <- raster(file.path(dir_git, 'spatial/loiczid_raster.tif'))


rare_rast_0pct   <- subs(loiczid_rast, spp_rarity, 
                         by = 'loiczid', which = 'rarity_0pct')
rare_rast_60pct  <- subs(loiczid_rast, spp_rarity, 
                         by = 'loiczid', which = 'rarity_60pct')
rare_rast_probwt <- subs(loiczid_rast, spp_rarity, 
                         by = 'loiczid', which = 'rarity_probwt')

plot(rare_rast_0pct, main = 'Range rarity, All species, any presence')

plot(rare_rast_60pct, main = 'Range rarity, All species, prob of occur >= 60%')

plot(rare_rast_probwt, main = 'Range rarity, All species, weighted by prob of occur')

### relative range rarity
rel_rare_rast_0pct   <- subs(loiczid_rast, spp_rel_rare, 
                         by = 'loiczid', which = 'rel_rare_0pct')
rel_rare_rast_60pct  <- subs(loiczid_rast, spp_rel_rare, 
                         by = 'loiczid', which = 'rel_rare_60pct')
rel_rare_rast_probwt <- subs(loiczid_rast, spp_rel_rare, 
                         by = 'loiczid', which = 'rel_rare_probwt')

plot(rel_rare_rast_0pct, main = 'Rel range rarity, All spp, any presence')

plot(rel_rare_rast_60pct, main = 'Rel range rarity, All spp, prob of occur >= 60%')

plot(rel_rare_rast_probwt, main = 'Rel range rarity, All spp, weighted by prob of occur')

3.2 Spatial distribution of current extinction risk

3.2.1 Aggregate mean risk and variance by cell

As in the species richness and rarity calculations, we will calculate mean risk using three different considerations of species presence: all presence (non-zero probability of occurrence), “core habitat” (60% or greater prob of occurrence), and probability-weighted.

This binds rows into a long dataframe, but the saved file is rather large. By rounding the decimals a bit, we can shave off quite a bit of file size.

iucn_version <- '2017-3'

cell_risk_file <- file.path(dir_data, sprintf('risk_by_cell_all_%s.csv', iucn_version))


iucn_to_am_lookup <- read_csv(file.path(dir_setup, 'int', 
                                        'am_iucn_crosslisted_spp.csv')) %>%
  select(am_sid = speciesid, iucn_sid = iucn_id) %>%
  distinct()

spp_current_risk <- read_csv(file.path(dir_data, 
                                       sprintf('iucn_risk_current_%s.csv', iucn_version))) %>%
  select(iucn_sid, cat, cat_score) %>%
  left_join(iucn_to_am_lookup, by = 'iucn_sid') %>%
  as.data.table()

spp_cells <- fread(file.path(dir_o_anx, 'am_files', 'am_spp_cells_assessed.csv'))
## 
Read 43.3% of 16116125 rows
Read 79.4% of 16116125 rows
Read 16116125 rows and 3 (of 3) columns from 0.338 GB file in 00:00:04
# git_prov(file.path(dir_o_anx, 'am_files', 'am_spp_cells.csv'), filetype = 'input')

### using data.table join:
spp_cells_risk <- spp_current_risk[spp_cells, on = 'am_sid'] %>%
  filter(!is.na(cat_score) & !is.na(iucn_sid))

spp_cells_risk_sum_0pct <- spp_cells_risk %>%
  filter(!is.na(cat_score) & !is.na(iucn_sid)) %>%
  # filter(prob >= 0.60) %>%
  group_by(loiczid) %>%
  summarize(n_spp     = n(), 
            mean_risk = mean(cat_score),
            var_risk  = var(cat_score),
            presence  = '0%')

spp_cells_risk_sum_60pct <- spp_cells_risk %>%
  filter(!is.na(cat_score) & !is.na(iucn_sid)) %>%
  filter(prob >= 0.60) %>%
  group_by(loiczid) %>%
  summarize(n_spp     = n(), 
            mean_risk = mean(cat_score),
            var_risk  = var(cat_score),
            presence  = '60%')

### For the prob weighted calculations, weight the mean and var:
###     mean = 1/n * sum(y)
### ==> mean = 1/sum(prob) * sum(y * prob)
###      var = 1/n * sum[y - E(y)]^2
### ==>  var = 1/sum(prob) * sum([y - E(y)]^2 * prob)
spp_cells_risk_sum_probwt <- spp_cells_risk %>%
  filter(!is.na(cat_score) & !is.na(iucn_sid)) %>%
  # filter(prob >= 0.60) %>%
  group_by(loiczid) %>%
  summarize(n_spp     = sum(prob),
            mean_risk = 1/n_spp * sum(cat_score * prob),
            var_risk  = 1/n_spp * sum((cat_score - mean_risk)^2 * prob),
            presence  = 'prob')

spp_cells_risk_sum <- bind_rows(spp_cells_risk_sum_0pct, 
                                spp_cells_risk_sum_60pct, 
                                spp_cells_risk_sum_probwt) %>%
  mutate(var_risk = ifelse(mean_risk == 0 & is.na(var_risk), 0, var_risk),
         mean_risk = round(mean_risk, 6),
         var_risk  = round(var_risk, 6))
    ### if mean risk == 0, all risks are zero, so zero variance.

write_csv(spp_cells_risk_sum, cell_risk_file)

3.2.2 Mean risks and variance maps

loiczid_rast <- raster(file.path(dir_git, 'spatial/loiczid_raster.tif'))

spp_cells_risk_sum <- read_csv(cell_risk_file,
                         col_types = 'idddc')

cuts <- c('0%', '60%', 'prob')

for(cut in cuts) { ### cut <- cuts[3]
  df_cut <- spp_cells_risk_sum %>%
    filter(presence == cut)
  
  cat(sprintf('#### Mean, variance, and richness based on %s', cut))
  
  risk_rast <- subs(loiczid_rast, df_cut, by = 'loiczid', which = 'mean_risk')
  var_rast  <- subs(loiczid_rast, df_cut, by = 'loiczid', which = 'var_risk')
  nspp_rast <- subs(loiczid_rast, df_cut, by = 'loiczid', which = 'n_spp')
  
  plot(risk_rast, main = sprintf('Mean risk, cut = %s', cut))
  plot(var_rast,  main = sprintf('Variance of risk, cut = %s', cut))
  plot(nspp_rast, main = sprintf('Assessed richness, cut = %s', cut))

}

3.2.2.1 Mean, variance, and richness based on 0%#### Mean, variance, and richness based on 60%#### Mean, variance, and richness based on prob

3.3 Spatial distribution of trend in extinction risk

3.3.1 Aggregate mean trend by cell

Again we will use a probability threshold of 0.60 to determine species presence. We filter out any species with NA for iucn_sid or trend_score.

cell_trend_file <- file.path(dir_data, 'spp_trend_by_loiczid.csv')

iucn_to_am_lookup <- read_csv(file.path(dir_data, 'am_iucn_crosslisted_spp.csv')) %>%
  select(am_sid = speciesid, iucn_sid = iucn_id) %>%
  distinct()

spp_trend <- read_csv(file.path(dir_data, 'trend_by_spp.csv')) %>%
  left_join(iucn_to_am_lookup, by = 'iucn_sid') %>%
  as.data.table()

if(!exists('spp_cells')) {
  spp_cells <- fread(file.path(dir_o_anx, 'am_files', 'am_spp_cells.csv'))
}
# git_prov(file.path(dir_o_anx, 'am_files', 'am_spp_cells.csv'), filetype = 'input')

### using data.table join:
spp_cells_trend <- spp_trend[spp_cells, on = 'am_sid']

spp_cells_trend_sum <- spp_cells_trend %>%
  filter(!is.na(trend_score) & !is.na(iucn_sid)) %>%
  filter(prob >= 0.60) %>%
  group_by(loiczid) %>%
  summarize(n_spp = n(), mean_trend = mean(trend_score))

write_csv(spp_cells_trend_sum, cell_trend_file)

3.4 Calculate means by species range

3.4.1 Aggregate mean risk and variance by cell

Here we will use a probability threshold of 0.60 to determine species presence. We filter out any species with NA for iucn_sid or cat_score.

cell_risk_by_range_file <- c(file.path(dir_data, 'risk_by_cell_by_range.csv'))

iucn_to_am_lookup <- read_csv(file.path(dir_data, 'am_iucn_crosslisted_spp.csv')) %>%
  select(am_sid = speciesid, iucn_sid = iucn_id) %>%
  distinct()

spp_ranges <- read_csv(file.path(dir_data, 'am_spp_range_summary.csv')) %>%
  arrange(area_km2) %>%
  mutate(cum_area = cumsum(area_km2),
         range_gp = case_when(cum_area <= max(cum_area) * .33 ~ 'small',
                              cum_area <= max(cum_area) * .67 ~ 'medium',
                              TRUE                       ~ 'large')) %>%
  select(am_sid, iucn_sid, area_km2, range_gp)
# summary(spp_ranges %>% filter(range_gp == 'small') %>% .$area_km2)
#      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
#     2522     400481   1180425   2192586   3739745   7681908
# summary(spp_ranges %>% filter(range_gp == 'medium') %>% .$area_km2)
#      Min.   1st Qu.    Median      Mean   3rd Qu.       Max. 
#   7690359   8286343   9184440  15440048  12563147  103892977 
# summary(spp_ranges %>% filter(range_gp == 'large') %>% .$area_km2)
#      Min.   1st Qu.    Median      Mean   3rd Qu.       Max. 
# 105087230 136623452 159164037 167382766 196088390  245559322

spp_current_risk <- read_csv(file.path(dir_data, 'iucn_spp_cat_current.csv')) %>%
  select(iucn_sid, cat, cat_score) %>%
  left_join(iucn_to_am_lookup, by = 'iucn_sid') %>%
  left_join(spp_ranges) %>%
  as.data.table()

if(!exists('spp_cells')) {
  spp_cells <- fread(file.path(dir_o_anx, 'am_files', 'am_spp_cells.csv'))
}
# git_prov(file.path(dir_o_anx, 'am_files', 'am_spp_cells.csv'), filetype = 'input')

### using data.table join:
spp_cells_risk <- spp_current_risk[spp_cells, on = 'am_sid']

spp_cells_risk_sum <- spp_cells_risk %>%
  filter(!is.na(cat_score) & !is.na(iucn_sid)) %>%
  filter(prob >= 0.60) %>%
  group_by(loiczid, range_gp) %>%
  summarize(n_spp = n(), 
            mean_risk = mean(cat_score))

write_csv(spp_cells_risk_sum, cell_risk_by_range_file)

3.4.2 Load raster and substitute mean risks

loiczid_rast <- raster(file.path(dir_git, 'spatial/loiczid_raster.tif'))

### gotta force the mean_risk column to be double; there are lots of zero
### values so will default to thinking that column is integer.
risk_by_cell_by_range <- read_csv(file.path(dir_data, 'risk_by_cell_by_range.csv'),
                                  col_types = 'dcdd')

for(range_size in c('small', 'medium', 'large')) {
  ### range_size <- 'medium'
  tmp <- risk_by_cell_by_range %>%
    filter(range_gp == range_size) %>%
    filter(n_spp >= 5)
  
  risk_rast <- subs(loiczid_rast, tmp, 
                    by = 'loiczid', which = 'mean_risk')
  nspp_rast <- subs(loiczid_rast, tmp, 
                    by = 'loiczid', which = 'n_spp')
  
  plot(risk_rast, main = paste0('Mean risk for spp w/range ', range_size, ' (n >= 5)'))
  plot(nspp_rast, main = paste0('Spp count for spp w/range ', range_size, ' (n >= 5)'))

}

3.5 Calculate means by taxa

3.5.1 Aggregate mean risk and variance by cell

Here we will use a probability threshold of 0.60 to determine species presence. We filter out any species with NA for iucn_sid or cat_score.

cell_risk_by_taxa_file <- c(file.path(dir_data, 'risk_by_cell_by_taxa.csv'))

iucn_to_am_lookup <- read_csv(file.path(dir_data, 'am_iucn_crosslisted_spp.csv')) %>%
  select(am_sid = speciesid, iucn_sid = iucn_id) %>%
  distinct()

spp_taxa <- read_csv(file.path(dir_data, 'am_spp_taxa.csv')) %>%
  select(am_sid, iucn_sid, taxa) %>%
  distinct()

spp_current_risk <- read_csv(file.path(dir_data, 'iucn_spp_cat_current.csv')) %>%
  select(iucn_sid, cat, cat_score) %>%
  left_join(iucn_to_am_lookup, by = 'iucn_sid') %>%
  left_join(spp_taxa) %>%
  as.data.table()

if(!exists('spp_cells')) {
  spp_cells <- fread(file.path(dir_o_anx, 'am_files', 'am_spp_cells.csv'))
}
# git_prov(file.path(dir_o_anx, 'am_files', 'am_spp_cells.csv'), filetype = 'input')

### using data.table join:
spp_cells_risk <- spp_current_risk[spp_cells, on = 'am_sid']

spp_cells_risk_sum <- spp_cells_risk %>%
  filter(!is.na(cat_score) & !is.na(iucn_sid)) %>%
  filter(prob >= 0.60) %>%
  group_by(loiczid, taxa) %>%
  summarize(n_spp = n(), 
            mean_risk = mean(cat_score))

write_csv(spp_cells_risk_sum, cell_risk_by_taxa_file)

3.5.2 Load raster and substitute mean risks

loiczid_rast <- raster(file.path(dir_git, 'spatial/loiczid_raster.tif'))

### gotta force the mean_risk column to be double; there are lots of zero
### values so will default to thinking that column is integer.
risk_by_cell_by_taxa <- read_csv(file.path(dir_data, 'risk_by_cell_by_taxa.csv'),
                                 col_types = 'dcdd')

taxa_list <- risk_by_cell_by_taxa$taxa %>% unique()

for(taxa_gp in taxa_list) {
  ### taxa_gp <- 'mammal'
  tmp <- risk_by_cell_by_taxa %>%
    filter(taxa == taxa_gp) %>%
    filter(n_spp >= 0)
  
  risk_rast <- subs(loiczid_rast, tmp, 
                    by = 'loiczid', which = 'mean_risk')
  nspp_rast <- subs(loiczid_rast, tmp, 
                    by = 'loiczid', which = 'n_spp')
  
  plot(risk_rast, main = paste0('Mean risk for spp in ', taxa_gp, ' (n >= 0)'))
  plot(nspp_rast, main = paste0('Spp count for spp in ', taxa_gp, ' (n >= 0)'))

}

# prov_wrapup(commit_outputs = FALSE)